# Uncomment to upgrade packages
#!pip3 install pandas --user --upgrade --quiet
#!pip3 install scipy --user --upgrade --quiet
#!pip3 install numpy --user --upgrade --quiet
#!pip3 install statsmodels --user --upgrade --quiet
#!pip3 install seaborn --user --upgrade --quiet
#%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# extra imports
from pandas import read_csv
from scipy.stats import boxcox, shapiro, chi2, chi2_contingency
from matplotlib import patches
import seaborn as sns
def print_categorical_variables(df):
    """Print every categorical column of *df* together with its unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object/category columns are listed.
    """
    categorical_vars = df.select_dtypes(include=['object', 'category']).columns
    print("Categorical Variables:")
    for var in categorical_vars:
        categories = df[var].unique().tolist()
        # BUG FIX: map(str, ...) guards against non-string categories
        # (numbers, None/NaN) which made the plain str.join raise TypeError.
        print(f"{var}: {', '.join(map(str, categories))}")
def print_numerical_variables(df):
    """Print the name of every non-categorical (numerical) column in *df*."""
    numeric_columns = df.select_dtypes(exclude=['object', 'category']).columns
    print("Numerical Variables:")
    for name in numeric_columns:
        print(name)
def split_cat_num_columns(df):
    """Partition the column names of *df* into (categorical, numerical) lists.

    A column counts as categorical when its dtype is object or category;
    everything else is treated as numerical.
    """
    cat_cols, num_cols = [], []
    for name in df.columns:
        bucket = cat_cols if df[name].dtype in ['object', 'category'] else num_cols
        bucket.append(name)
    return cat_cols, num_cols
def plot_dataframe(df, x=4, y=4):
    """Plot every column of *df* on an x-by-y grid of subplots.

    Categorical columns get count plots with per-bar labels; numerical
    columns get density histograms with a KDE overlay.

    Parameters
    ----------
    df : pandas.DataFrame
    x, y : int
        Grid dimensions; only the first ``x * y`` columns are drawn
        (the original raised IndexError when the frame had more columns
        than grid cells).
    """
    cat_cols, _ = split_cat_num_columns(df)
    # Custom color palette shared by all subplots.
    colors = ['#648E9C', '#9CB1BC', '#C5D4DE', '#E8F1F4']
    fig, axes = plt.subplots(x, y, figsize=(18, 16))
    # atleast_1d makes the 1x1-grid case (a bare Axes) reshape-safe.
    axes = np.atleast_1d(axes).reshape(-1)
    font_props = {'fontsize': 10}
    # BUG FIX: clip to the grid capacity instead of indexing past the axes.
    columns = list(df.columns)[:len(axes)]
    for i, col in enumerate(columns):
        ax = axes[i]
        if col in cat_cols:
            sns.countplot(x=col, data=df, ax=ax, palette=colors)
            ax.set_xlabel(col, fontdict=font_props)
            ax.set_ylabel('Count', fontdict=font_props)
            # Annotate each bar with its count.
            for p in ax.patches:
                ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                            ha='center', va='bottom', xytext=(0, 5), textcoords='offset points', fontsize=8)
        else:
            sns.histplot(x=col, data=df, ax=ax, color=colors[0], kde=True, stat="density")
            kde_color = '#9C648E'  # color for the Gaussian (KDE) curve
            sns.kdeplot(x=col, data=df, ax=ax, color=kde_color, lw=1.5)
            ax.set_xlabel(col, fontdict=font_props)
            ax.set_ylabel('Density', fontdict=font_props)
        # Rotate x-axis labels and shrink tick labels.
        ax.tick_params(axis='x', labelrotation=45)
        ax.tick_params(axis='both', labelsize=8)
        # Leave headroom above the bars for the count annotations.
        ax.set_ylim(0, ax.get_ylim()[1] * 1.15)
    # BUG FIX: remove the unused grid cells instead of leaving empty frames.
    for ax in axes[len(columns):]:
        fig.delaxes(ax)
    plt.tight_layout(pad=2.0)
    plt.show()
def calculate_outliers(data, column):
    """Compute IQR statistics and the outlier values of one numeric column.

    Returns (q1, q3, iqr, lower_bound, upper_bound, outliers,
    extreme_outliers): outliers fall outside [q1 - 1.5*iqr, q3 + 1.5*iqr],
    extreme outliers fall a further 3*iqr beyond those bounds.
    """
    series = data[column]
    q1, q3 = np.percentile(series, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outside = (series < lower_bound) | (series > upper_bound)
    far_outside = (series < lower_bound - 3 * iqr) | (series > upper_bound + 3 * iqr)
    return q1, q3, iqr, lower_bound, upper_bound, series[outside], series[far_outside]
def plot_boxplot_histogram(data, column):
    """Draw a side-by-side box plot and histogram of *column* with the IQR
    outlier boundaries (purple dashed) and extreme-outlier boundaries
    (blue dashed) overlaid on both panels.
    """
    # Custom color palette (first entry colors the histogram bars).
    colors = ['#648E9C', '#9CB1BC', '#C5D4DE', '#E8F1F4']
    # Calculate outliers and extreme outliers via the shared IQR helper.
    q1, q3, iqr, lower_bound, upper_bound, outliers, extreme_outliers = calculate_outliers(data, column)
    # Two panels: narrow box plot (1) next to a wide histogram (4).
    fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [1, 4]}, figsize=(9, 5))
    # Box plot
    boxplot = axes[0].boxplot(data[column], showfliers=True)
    axes[0].set_title(f'{column} - Box Plot', fontsize=12)
    axes[0].set_ylabel(f'{column}', fontsize=10)
    # Draw the outlier boundary lines on the box plot if outliers exist.
    # NOTE(review): .any() tests the truthiness of the VALUES, not emptiness —
    # a 0-valued outlier would be missed; `not outliers.empty` would test
    # emptiness. Left as-is to preserve behavior; confirm intent.
    if outliers.any():
        axes[0].axhline(lower_bound, color='#9C648E', linestyle='--')
        axes[0].axhline(upper_bound, color='#9C648E', linestyle='--')
    # Draw the extreme-outlier boundary lines on the box plot if any exist.
    if extreme_outliers.any():
        axes[0].axhline(lower_bound - 3 * iqr, color='#3F51B5', linestyle='--')
        axes[0].axhline(upper_bound + 3 * iqr, color='#3F51B5', linestyle='--')
    # Recolor the box-plot flier markers blue to match the extreme boundary.
    for flier in boxplot['fliers']:
        flier.set(marker='o', color='#3F51B5', alpha=0.5)
    # Histogram
    hist = sns.histplot(data=data, x=column, ax=axes[1], color=colors[0])
    axes[1].set_title(f'{column} - Histogram', fontsize=12)
    axes[1].set_xlabel(f'{column}', fontsize=10)
    axes[1].set_ylabel('Frequency', fontsize=10)
    # Mirror the outlier boundary lines on the histogram if outliers exist.
    if outliers.any():
        hist.axvline(lower_bound, color='#9C648E', linestyle='--')
        hist.axvline(upper_bound, color='#9C648E', linestyle='--')
    # Mirror the extreme-outlier boundary lines on the histogram if any exist.
    if extreme_outliers.any():
        hist.axvline(lower_bound - 3 * iqr, color='#3F51B5', linestyle='--')
        hist.axvline(upper_bound + 3 * iqr, color='#3F51B5', linestyle='--')
    # Legend explaining the two boundary-line colors.
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='#9C648E', linestyle='--', markersize=5, label='Outlier Boundary'),
        plt.Line2D([0], [0], marker='o', color='#3F51B5', linestyle='--', markersize=5, alpha=0.5, label='Extreme Outlier')
    ]
    axes[1].legend(handles=legend_elements, loc='upper right')
    # Shrink tick label font size for both subplots.
    for ax in axes:
        ax.tick_params(axis='both', labelsize=8)
    # Adjust spacing between subplots.
    plt.tight_layout()
    # Display the plot (trailing ';' suppresses notebook echo).
    plt.show();
def print_outlier_analysis(column, q1, q3, iqr, lower_bound, upper_bound, outliers, extreme_outliers):
    """Print a summary of the IQR statistics produced by calculate_outliers.

    Consistency fix: the original mixed str.format and f-strings; all lines
    now use f-strings (output is unchanged).
    """
    print(f"{column} Outlier Analysis:")
    print("-----------------------------")
    print(f"First Quartile (Q1): {q1:.2f}")
    print(f"Third Quartile (Q3): {q3:.2f}")
    print(f"Interquartile Range (IQR): {iqr:.2f}")
    print(f"Lower Bound: {lower_bound:.2f}")
    print(f"Upper Bound: {upper_bound:.2f}")
    print(f"Outliers Length: {len(outliers)}")
    print(f"Extreme Outliers Length: {len(extreme_outliers)}")
def bivariate_numerical_exploratory_analysis(df, target):
    """Draw a seaborn pair plot of *df*, colored by the *target* variable."""
    plt.rcParams['font.size'] = 10
    palette = ['#648E9C', '#9C648E']  # one color per target class
    sns.pairplot(df, hue=target, palette=palette, plot_kws={'alpha': 0.75})
def plot_categorical_variables(data, target):
    """Count-plot every categorical column of *data*, split by *target*.

    One subplot per categorical variable on a 3-column grid; surplus grid
    cells are removed.
    """
    categorical_vars = data.select_dtypes(include=['object', 'category']).columns
    num_plots = len(categorical_vars)
    if num_plots == 0:
        # BUG FIX: the original raised NameError on `i` when there were
        # no categorical columns.
        return
    num_cols = 3  # number of columns in the subplot grid
    num_rows = (num_plots - 1) // num_cols + 1
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows))
    axes = np.atleast_1d(axes).flatten()
    for i, var in enumerate(categorical_vars):
        sns.countplot(x=var, hue=target, data=data, ax=axes[i], palette=["#648E9C", "#9C648E"])
        axes[i].set_title(f'{var} vs {target}')
        axes[i].set_xlabel(var)
        axes[i].set_ylabel('Count')
        # tick_params avoids the fixed-locator warning raised by
        # set_xticklabels(get_xticklabels(), rotation=45).
        axes[i].tick_params(axis='x', labelrotation=45)
        axes[i].legend(title=target)
    # BUG FIX: hide the empty trailing subplots. The original iterated
    # range(i + 1, num_plots), which is always empty after the loop
    # (i == num_plots - 1), so surplus axes were never removed.
    for j in range(num_plots, len(axes)):
        fig.delaxes(axes[j])
    plt.tight_layout()
    plt.show()
def correlation_heatmap(df):
    """Show an annotated heatmap of the correlations between the numeric
    columns of *df*.
    """
    numeric_cols = df.select_dtypes(include='number')  # numeric columns only
    correlation = numeric_cols.corr()
    cmap = sns.color_palette(['#648E9C', '#9C648E'])
    plt.figure(figsize=(8, 6))
    # NOTE(review): an upper-triangle mask used to be computed here but was
    # never passed to sns.heatmap; the dead code has been removed. Pass
    # mask=np.triu(np.ones_like(correlation, dtype=bool)) to restore it.
    sns.heatmap(correlation, annot=True, cmap=cmap, linewidths=0.5)
    plt.title('Correlation Matrix')
    # Rotate the variable labels so long names stay readable.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45, ha='right')
    plt.show()
def generate_cross_tabulations(df, target_var):
    """Print the cross-tabulation of every categorical column of *df*
    against *target_var*, followed by the row totals (one per category of
    the variable) and the column totals (one per category of the target).
    """
    categorical_vars = df.select_dtypes(include=['object', 'category']).columns
    for var in categorical_vars:
        crosstab = pd.crosstab(df[var], df[target_var])
        # BUG FIX: the axes were swapped. sum(axis=1) sums ACROSS a row,
        # giving one total per row; sum(axis=0) sums DOWN a column. The
        # original printed column totals under "Row sums" and vice versa.
        row_sums = crosstab.sum(axis=1)
        col_sums = crosstab.sum(axis=0)
        print(f"Cross-tabulation for {var} and {target_var}:")
        print(crosstab)
        print("Row sums:")
        print(row_sums)
        print("Column sums:")
        print(col_sums)
        print("\n")
def perform_chi_squared_test(df, cat_cols):
    """Run pairwise chi-squared independence tests over every ordered pair
    of categorical columns in *cat_cols*.

    Returns a DataFrame with columns ['Variable 1', 'Variable 2',
    'Chi-square', 'P-value']. Both (A, B) and (B, A) orderings are kept,
    matching the original output shape.
    """
    results = []
    for col1 in cat_cols:
        for col2 in cat_cols:
            if col1 != col2:
                contingency_table = pd.crosstab(df[col1], df[col2])
                # Renamed from `chi2` to avoid shadowing scipy.stats.chi2,
                # which this module imports and uses elsewhere.
                chi2_stat, p_value, _, _ = chi2_contingency(contingency_table)
                results.append((col1, col2, chi2_stat, p_value))
    results_df = pd.DataFrame(results, columns=['Variable 1', 'Variable 2', 'Chi-square', 'P-value'])
    return results_df
Breast_Cancer.csv¶In this first part we are going to load the dataset, explore it and get some first insights.
# Load the breast-cancer dataset from the working directory, then display
# its dimensions as (rows, columns).
breast_cancer = read_csv("./Breast_Cancer.csv", header=0, delimiter=',')
breast_cancer.shape
(4024, 16)
As we can see, the dataset has 4024 observations and 16 columns (variables). Our target value is the 16th, which is the Status of the patient and it can be binary (Dead or Alive).
breast_cancer.columns
Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
'Progesterone Status', 'Regional Node Examined',
'Reginol Node Positive', 'Survival Months', 'Status'],
dtype='object')
# Fix the misspelled source column name ("Reginol" -> "Regional") in place.
breast_cancer.rename(columns={"Reginol Node Positive" : "Regional Node Positive"}, inplace=True) # Fixing typo of the column name
Below we can see the first 5 rows of the dataset in order to have a brief overview of the available data.
breast_cancer.head()
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | Estrogen Status | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68 | White | Married | T1 | N1 | IIA | Poorly differentiated | 3 | Regional | 4 | Positive | Positive | 24 | 1 | 60 | Alive |
| 1 | 50 | White | Married | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 35 | Positive | Positive | 14 | 5 | 62 | Alive |
| 2 | 58 | White | Divorced | T3 | N3 | IIIC | Moderately differentiated | 2 | Regional | 63 | Positive | Positive | 14 | 7 | 75 | Alive |
| 3 | 58 | White | Married | T1 | N1 | IIA | Poorly differentiated | 3 | Regional | 18 | Positive | Positive | 2 | 1 | 84 | Alive |
| 4 | 47 | White | Married | T2 | N1 | IIB | Poorly differentiated | 3 | Regional | 41 | Positive | Positive | 3 | 1 | 50 | Alive |
To begin with, a quick look at the variables types is taking place.
breast_cancer.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4024 entries, 0 to 4023 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 4024 non-null int64 1 Race 4024 non-null object 2 Marital Status 4024 non-null object 3 T Stage 4024 non-null object 4 N Stage 4024 non-null object 5 6th Stage 4024 non-null object 6 differentiate 4024 non-null object 7 Grade 4024 non-null object 8 A Stage 4024 non-null object 9 Tumor Size 4024 non-null int64 10 Estrogen Status 4024 non-null object 11 Progesterone Status 4024 non-null object 12 Regional Node Examined 4024 non-null int64 13 Regional Node Positive 4024 non-null int64 14 Survival Months 4024 non-null int64 15 Status 4024 non-null object dtypes: int64(5), object(11) memory usage: 503.1+ KB
As it was mentioned in the dataset's metadata, there are in total 16 variables, 5 of which are numerical and 11 categorical. Also, as it can be seen in the results of the previous command, there are not null values (missing values) included in the dataset. However, we need to take a look at the distribution of the numerical variables, as well as the categories of the categorical variables in order to be sure that the dataset does not contain erroneous data.
breast_cancer.describe()
| Age | Tumor Size | Regional Node Examined | Regional Node Positive | Survival Months | |
|---|---|---|---|---|---|
| count | 4024.000000 | 4024.000000 | 4024.000000 | 4024.000000 | 4024.000000 |
| mean | 53.972167 | 30.473658 | 14.357107 | 4.158052 | 71.297962 |
| std | 8.963134 | 21.119696 | 8.099675 | 5.109331 | 22.921430 |
| min | 30.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 47.000000 | 16.000000 | 9.000000 | 1.000000 | 56.000000 |
| 50% | 54.000000 | 25.000000 | 14.000000 | 2.000000 | 73.000000 |
| 75% | 61.000000 | 38.000000 | 19.000000 | 5.000000 | 90.000000 |
| max | 69.000000 | 140.000000 | 61.000000 | 46.000000 | 107.000000 |
The results here indicate that the numerical variables do not include any abnormally high maximums (e.g. 9999999), thus we can consider that the numerical variables lack errors. However, from the statistics calculated one can understand that there are some outliers in some of the variables. For example, in the variable Regional Node Positive, the value 46 is extremely far from the rest of the distribution. Outlier analysis is performed in the following sections.
As for the categorical variables by executing the following command, one can take a look at the available categories of the variables.
print_categorical_variables(breast_cancer)
Categorical Variables: Race: White, Black, Other Marital Status: Married, Divorced, Single , Widowed, Separated T Stage : T1, T2, T3, T4 N Stage: N1, N2, N3 6th Stage: IIA, IIIA, IIIC, IIB, IIIB differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated Grade: 3, 2, 1, anaplastic; Grade IV A Stage: Regional, Distant Estrogen Status: Positive, Negative Progesterone Status: Positive, Negative Status: Alive, Dead
From the result, it can be concluded that the categorical variables, do not contain erroneous categories. Thus, missing or erroneous data is not present in this dataset. In case of missing or erroneous data in the dataset, imputation techniques need to be considered. However, if the percentage of erroneous or missing data for a specific variable is large enough (e.g. 70%) then the best solution would be to not consider it part of the analysis.
Just to be sure, the dropna command is being used, but it can be seen that not a single row is discarded, as the shape of the dataset remains the same as the origin one.
# Sanity check: dropna() discards nothing (the shape stays (4024, 16)),
# confirming the dataset has no missing values.
breast_cancer.dropna(inplace=True)
breast_cancer.shape
(4024, 16)
Finally, a brief description of the available variables is taking place:
Numerical Variables:
Categorical Variables:
To begin with, we are deviding into two separate lists the column names of the numerical and categorical variables, respectively.
# Summarize the categorical variables (with their categories) and the
# numerical variable names of the dataset.
print_categorical_variables(breast_cancer)
print("\n")
print_numerical_variables(breast_cancer)
Categorical Variables: Race: White, Black, Other Marital Status: Married, Divorced, Single , Widowed, Separated T Stage : T1, T2, T3, T4 N Stage: N1, N2, N3 6th Stage: IIA, IIIA, IIIC, IIB, IIIB differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated Grade: 3, 2, 1, anaplastic; Grade IV A Stage: Regional, Distant Estrogen Status: Positive, Negative Progesterone Status: Positive, Negative Status: Alive, Dead Numerical Variables: Age Tumor Size Regional Node Examined Regional Node Positive Survival Months
Once the variables have been separated into categorical and numerical, it is possible to create a single plot containing all the histograms of the numerical variables and the countplots of the categorical variables. With the following visualisation, an initial overview of the distribution of the available variables is possible.
plot_dataframe(breast_cancer)
After analyzing the figures above, we can conclude that almost all of the variables have an unbalanced distribution. More specifically:
Categorical Variables:
Race: Most of the women in the dataset are White leaving the other categories of those variables with comparable less observations.Marital Status: Again, most of the women in the dataset are Married and the difference between the groups is huge (2643 married compared to 45 separated).T Stage: Most of the available data refer to cases of T1 and T2, while T3 and T4 are less.N Stage: Dame is true here for the category N1.6th Stage: In this case only the category IIIB lacks observations.differentiate: The four categories are totally unbalanced, while the undifferentiate one contains only 19 out of 4024.Grade : The same is true here for the category anaplastic; GradeIV.A Stage: Distant alues are only 92 out of 4024.Estrogen Status: Same is true here for category Negative.Progesterone Status: Here as well for Negative values.Status (Target Variable): The distribution is unbalacned for the target variable as well, since we obtain 616observations for Dead cases and 3408 for Alive.Numerical Variables:
Age: The distribution of age is quite fine, since there are several observation for mostly all the ages between 30 and 70 years old.Tumor Size: The distribtuion of the tumor sizes is left-skewed, meaning that there are some values that take small values, but at the same time some values are considered too high. This will lead to outliers and they need to be treated correctly.Regional Node Examined: In this case the distribution is quite fine, only some extre cases occur for very small values of the variable, and some outliers exist in the high range of x-axis.Regional Node Positive: This variable is hardly left-skewed, leading to the conlcusion that most of the observations have very small numbers for this variable, and a few include very high values.Survival Months: It seems a bit right-skewed, thus observations with small numbers of survival months (0 to 40) need to be treated appropriately.The comments mentioned here need to be addressed in order to avoid fitting a learning algorithm which will generate a model described by high bias. For this reason, below the Outliers Analysis is included.
Before moving on to the Outliers Analysis, the Shapiro normality test is performed on the numerical variables. In that way it can be statistically shown which numerical variables follow a normal distribution. Later in the Bivariate Exploratory Analysis, Chi-Squared tests are performed on the categorical variables as well, in order to check correlation between them.
# Split the column names into categorical / numerical lists for later use.
cat_cols, num_cols = split_cat_num_columns(breast_cancer)
# Check the normality of each numerical variable with the Shapiro-Wilk test;
# a small p-value rejects the hypothesis of a normal distribution.
shapiro_results = []
for column in num_cols:
    stat, p_value = shapiro(breast_cancer[column])
    shapiro_results.append((column, stat, p_value))
shapiro_df = pd.DataFrame(shapiro_results, columns=['Variable', 'Statistic', 'P_Value'])
print(shapiro_df)
Variable Statistic P_Value 0 Age 0.975857 1.395776e-25 1 Tumor Size 0.841386 0.000000e+00 2 Regional Node Examined 0.959947 5.811554e-32 3 Regional Node Positive 0.653167 0.000000e+00 4 Survival Months 0.962401 4.003365e-31
The results of the Shapiro-Wilk normality test reveal that none of the numerical variables (Age, Tumor Size, Regional Node Examined, Regional Node Positive, and Survival Months) exhibit a normal distribution, as indicated by the extremely small p-values obtained. This suggests that these variables deviate significantly from a normal distribution. To mitigate potential issues during the modeling phase, we consider applying suitable transformations or utilizing non-parametric models that do not rely on normality assumptions.
However, taking into account our numerical variables and by looking at the generated figures, it looks like Tumor Size could be transformed into a normal distribution with the usage of logarithmic transformation or the boxcox function. Here are the results:
# Histogram of log10-transformed 'Tumor Size': a first attempt to bring the
# left-skewed variable closer to a normal distribution.
breast_cancer['Tumor Size'].apply(np.log10).hist(bins='auto',figsize=(8,8), color='#648E9C',alpha=0.7, edgecolor='black');
# Apply the Box-Cox transformation to 'Tumor Size'; lambda_ is the fitted
# power-transform exponent (lambda ~= 0 is equivalent to a log transform).
transformed_tumor_size, lambda_ = boxcox(breast_cancer['Tumor Size'])
# Create a new figure with the desired size
plt.figure(figsize=(8, 8))
# Plot the histogram of the transformed 'Tumor Size'
plt.hist(transformed_tumor_size, bins='auto', color='#648E9C', alpha=0.7, edgecolor='black')
plt.xlabel('Transformed Tumor Size')
plt.ylabel('Frequency')
plt.title('Histogram of Transformed Tumor Size')
plt.show();
# Display the fitted Box-Cox lambda.
lambda_
0.10735796756858493
Since the lambda variable is equal to 0.10, it means that the transformation needed for Tumor Size is close to a root transform, meaning Box-Cox is preferred compared to the plain logarithmic transformation. However, the outlier analysis and the extraction of new features will take place before we move on to preprocessing and normalizing the dataset. This decision is taken in order to first check the correctness of the data, and in order to divide the preprocessing and normalization of the training and test data. In that way, we can be sure that the results of the model on the test data indicate whether the model is able to generalize well or not on new, unseen cases.
During the development and training of learning algorithms the inclusion of outliers can result in high bias results, thus it is very important to treat them carefully, first by identifying them and finally impute, remove, or just acknowledge their existence. The analysis is completed for all the numerical variables of the dataset. Firstly, Tumor Size is being analyzed.
# Visualize and quantify the IQR outliers of 'Tumor Size'.
plot_boxplot_histogram(breast_cancer, 'Tumor Size')
ts_q1, ts_q3, ts_iqr, ts_lower_bound, ts_upper_bound, ts_outliers, ts_extreme_outliers = calculate_outliers(breast_cancer, 'Tumor Size')
print_outlier_analysis('Tumor Size', ts_q1, ts_q3, ts_iqr, ts_lower_bound, ts_upper_bound, ts_outliers, ts_extreme_outliers)
Tumor Size Outlier Analysis: ----------------------------- First Quartile (Q1): 16.00 Third Quartile (Q3): 38.00 Interquartile Range (IQR): 22.00 Lower Bound: -17.00 Upper Bound: 71.00 Outliers Length: 222 Extreme Outliers Length: 5
#ts_extreme_outliers
# Inspect the full rows of the extreme 'Tumor Size' outliers.
breast_cancer.loc[ts_extreme_outliers.index]
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | Estrogen Status | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 289 | 41 | White | Married | T3 | N3 | IIIC | Poorly differentiated | 3 | Regional | 140 | Positive | Positive | 41 | 15 | 51 | Dead |
| 740 | 49 | White | Married | T3 | N1 | IIIA | Moderately differentiated | 2 | Regional | 140 | Positive | Positive | 14 | 2 | 48 | Alive |
| 1007 | 60 | White | Divorced | T3 | N2 | IIIA | Moderately differentiated | 2 | Regional | 140 | Positive | Positive | 21 | 5 | 57 | Alive |
| 1512 | 63 | White | Married | T4 | N2 | IIIB | Moderately differentiated | 2 | Regional | 140 | Positive | Positive | 9 | 8 | 89 | Alive |
| 3965 | 47 | White | Married | T3 | N2 | IIIA | Well differentiated | 1 | Regional | 140 | Positive | Positive | 23 | 7 | 64 | Alive |
# Keep a copy of the extreme 'Tumor Size' outlier rows, then remove them
# from the main dataset and re-index it; the shape drops to (4019, 16).
extreme_outliers_tumor_size = breast_cancer.loc[ts_extreme_outliers.index].copy().reset_index()
breast_cancer.drop(ts_extreme_outliers.index, inplace=True)
breast_cancer.reset_index(drop=True, inplace=True)
breast_cancer.shape
(4019, 16)
So here for the variable Tumor Size we can see that there are 222 outliers in total (starting from values of 71), of which 5 are considered extreme outliers (having size equal to 140). By taking a look at the observations of extreme outliers for Tumor Size, we consider those cases as extreme, thus they are going to be separated from the main dataset. Additional analysis of those cases will be completed during the modelling part of the project. The analysis is continued with Regional Node Examined.
# Outlier analysis for 'Regional Node Examined'.
plot_boxplot_histogram(breast_cancer, 'Regional Node Examined')
rne_q1, rne_q3, rne_iqr, rne_lower_bound, rne_upper_bound, rne_outliers, rne_extreme_outliers = calculate_outliers(breast_cancer, 'Regional Node Examined')
print_outlier_analysis('Regional Node Examined', rne_q1, rne_q3, rne_iqr, rne_lower_bound, rne_upper_bound, rne_outliers, rne_extreme_outliers)
Regional Node Examined Outlier Analysis: ----------------------------- First Quartile (Q1): 9.00 Third Quartile (Q3): 19.00 Interquartile Range (IQR): 10.00 Lower Bound: -6.00 Upper Bound: 34.00 Outliers Length: 71 Extreme Outliers Length: 0
rne_outliers.describe()
count 71.000000 mean 41.211268 std 6.313060 min 35.000000 25% 36.000000 50% 39.000000 75% 44.500000 max 61.000000 Name: Regional Node Examined, dtype: float64
For Regional Node Examined it can be seen that there are not any extreme outliers, while only 71 values are considered outliers, but as the values seem logical we are keeping all the information in the dataset. Let's move on to Regional Node Positive.
# Outlier analysis for 'Regional Node Positive'.
plot_boxplot_histogram(breast_cancer, 'Regional Node Positive')
rnp_q1, rnp_q3, rnp_iqr, rnp_lower_bound, rnp_upper_bound, rnp_outliers, rnp_extreme_outliers = calculate_outliers(breast_cancer, 'Regional Node Positive')
print_outlier_analysis('Regional Node Positive', rnp_q1, rnp_q3, rnp_iqr, rnp_lower_bound, rnp_upper_bound, rnp_outliers, rnp_extreme_outliers)
Regional Node Positive Outlier Analysis: ----------------------------- First Quartile (Q1): 1.00 Third Quartile (Q3): 5.00 Interquartile Range (IQR): 4.00 Lower Bound: -5.00 Upper Bound: 11.00 Outliers Length: 343 Extreme Outliers Length: 54
rnp_extreme_outliers.describe()
count 54.000000 mean 28.592593 std 4.478538 min 24.000000 25% 26.000000 50% 27.500000 75% 29.750000 max 46.000000 Name: Regional Node Positive, dtype: float64
For Regional Node Positive there are 54 extreme outliers (with values more than 23), while 343 values are considered outliers. By checking the extreme outliers, we can conclude that those extreme cases are not so extreme after all, so we will consider them for the training of the model. Lastly, Survival Months is analyzed.
# Outlier analysis for 'Survival Months'.
plot_boxplot_histogram(breast_cancer, 'Survival Months')
sm_q1, sm_q3, sm_iqr, sm_lower_bound, sm_upper_bound, sm_outliers, sm_extreme_outliers = calculate_outliers(breast_cancer, 'Survival Months')
print_outlier_analysis('Survival Months', sm_q1, sm_q3, sm_iqr, sm_lower_bound, sm_upper_bound, sm_outliers, sm_extreme_outliers)
Survival Months Outlier Analysis: ----------------------------- First Quartile (Q1): 56.00 Third Quartile (Q3): 90.00 Interquartile Range (IQR): 34.00 Lower Bound: 5.00 Upper Bound: 141.00 Outliers Length: 18 Extreme Outliers Length: 0
breast_cancer.loc[sm_outliers.index]
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | Estrogen Status | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 413 | 55 | White | Married | T1 | N1 | IIA | Moderately differentiated | 2 | Regional | 15 | Positive | Positive | 9 | 1 | 3 | Alive |
| 678 | 62 | White | Married | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 25 | Positive | Positive | 13 | 4 | 4 | Dead |
| 720 | 49 | White | Married | T2 | N3 | IIIC | Moderately differentiated | 2 | Regional | 32 | Positive | Positive | 20 | 11 | 3 | Alive |
| 894 | 67 | White | Married | T3 | N2 | IIIA | Poorly differentiated | 3 | Regional | 55 | Positive | Positive | 9 | 9 | 4 | Dead |
| 919 | 43 | Other | Married | T2 | N3 | IIIC | Moderately differentiated | 2 | Regional | 40 | Positive | Positive | 19 | 11 | 1 | Alive |
| 926 | 64 | White | Single | T2 | N1 | IIB | Moderately differentiated | 2 | Regional | 22 | Positive | Positive | 1 | 1 | 3 | Dead |
| 1039 | 64 | White | Divorced | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 25 | Positive | Positive | 9 | 4 | 4 | Dead |
| 1153 | 67 | White | Married | T2 | N1 | IIB | Poorly differentiated | 3 | Regional | 25 | Positive | Positive | 4 | 1 | 2 | Dead |
| 1698 | 59 | White | Single | T3 | N1 | IIIA | Moderately differentiated | 2 | Regional | 70 | Positive | Positive | 9 | 1 | 4 | Dead |
| 1705 | 63 | White | Married | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 35 | Positive | Positive | 21 | 5 | 3 | Dead |
| 1727 | 61 | Black | Widowed | T2 | N3 | IIIC | Poorly differentiated | 3 | Regional | 47 | Positive | Positive | 21 | 21 | 4 | Dead |
| 1747 | 46 | White | Divorced | T1 | N1 | IIA | Moderately differentiated | 2 | Regional | 19 | Positive | Positive | 26 | 1 | 2 | Dead |
| 1889 | 49 | White | Divorced | T2 | N1 | IIB | Moderately differentiated | 2 | Regional | 38 | Positive | Negative | 15 | 3 | 4 | Dead |
| 2226 | 47 | Other | Married | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 45 | Positive | Positive | 25 | 9 | 2 | Alive |
| 2406 | 69 | White | Married | T1 | N1 | IIA | Moderately differentiated | 2 | Regional | 12 | Positive | Negative | 9 | 1 | 4 | Dead |
| 3085 | 63 | White | Married | T2 | N2 | IIIA | Moderately differentiated | 2 | Regional | 26 | Positive | Positive | 28 | 9 | 4 | Dead |
| 3470 | 58 | Black | Widowed | T1 | N2 | IIIA | Moderately differentiated | 2 | Regional | 2 | Positive | Negative | 11 | 4 | 4 | Dead |
| 3799 | 53 | White | Separated | T1 | N1 | IIA | Moderately differentiated | 2 | Regional | 17 | Positive | Positive | 1 | 1 | 4 | Alive |
For Survival Months extreme outliers are not present, and only 18 observations are considered outliers, which correspond to cases that had values smaller than 5 months. Also by taking a closer look at those observations it can be derived that in most of the cases, those outliers with small values of Survival Months variable correspond to Dead cases, thus we retain those observations inside the dataset.
Following, the Bivariate Exploratory Analysis takes place. Part of the Bivariate Exploratory Analysis is the section Multivariate Outliers Analysis, which is performed with the Mahalanobis distance.
In this section, a deeper look is taken into the relationship between the pairs of available variables with respect to the target variable Status.
Firstly, the relation between the numerical variables with respect to the target variable is depicted in the following figure.
bivariate_numerical_exploratory_analysis(breast_cancer, 'Status')
Before stating the conclusions of the generated figure, let's take a look at the Correlation Heatmap below as well.
correlation_heatmap(breast_cancer)
We see no strong correlation between numerical variables, some insights we can get, for example, are the following:
Age and Tumor Size, we can see that they are indipendent as the value indicated is -0.077.Regional Node Examined and Regional Node Positive indicates a moderate positive correlation of 0.412, thus, as the number of regional nodes examined increases, the number of positive nodes also tends to increase.Survival Months and other variables is relatively weak. This indicates a weak or no linear relationship between survival months and the other numerical variables in the dataset.However, by taking a look at the scatter plot of the numerical variables (before the correlation matrix), one can understand that the observations of Alive and Dead classes, are mixed when a combination of two numerical variables occurs. By that it is meant, that there is not clear separation between a pair of numerical variables for distinguishing Alive from Dead cases. The only interesting insight from the scatterplot occures for the variable Survival Months, which seems to generate a good separation boundary between the two classes, for all the combinations of the remaining numerical variables. This insight leads to the conclusion that the variable Survival Months can offer quite strong predictability power to the model.
To continue with, in the next plot, the distribution of the categorical variables with respect to the target varible is presented.
plot_categorical_variables(breast_cancer, "Status")
Before concluding about the categorical variables, it is necessary to statistically test whether correlation exists between them by using the Chi-Squared test.
# Pairwise chi-squared independence tests between all categorical variables;
# the full result table is also persisted to disk for inspection.
chi_squared_results = perform_chi_squared_test(breast_cancer, cat_cols)
print(chi_squared_results)
chi_squared_results.to_csv("./chi-2.csv")
Variable 1 Variable 2 Chi-square P-value 0 Race Marital Status 137.649912 7.308896e-26 1 Race T Stage 8.309501 2.162940e-01 2 Race N Stage 6.215014 1.836560e-01 3 Race 6th Stage 8.923983 3.487492e-01 4 Race differentiate 27.855377 1.000418e-04 .. ... ... ... ... 105 Status differentiate 111.262966 5.868421e-24 106 Status Grade 111.262966 5.868421e-24 107 Status A Stage 35.795768 2.191234e-09 108 Status Estrogen Status 135.274057 2.876016e-31 109 Status Progesterone Status 125.062415 4.931880e-29 [110 rows x 4 columns]
By considering the barplots above, there is not clear conclusions to be made, due to the unbalanced nature of the dataset as mentioned earlier. However, some interesting insights generated by this figure is that for the variables T Stage, N Stage and 6th Stage, it can be observed that for the categories T4, N3 and IIC, respectively, the difference between the Alive and Dead cases, tend to get smaller, which might mean that these specific categories my help the model recognise patterns for Dead cases. Finally, it can be observed that in some of them, the probability of survival increases, for example, when the patient has a T1 value for the T Stage variable.
In addition, by taking a look at the produced csv file, the following conclusions can be derived:
Race and Marital Status: There is a significant association between race and marital status (Chi-square = 137.96, p < 0.001).Race and other variables: Race does not show a significant association with T Stage, N Stage, 6th Stage, A Stage, or Progesterone Status. However, it is significantly associated with differentiation, grade, estrogen status, and overall status (p < 0.05).Marital Status and other variables: Marital status is significantly associated with T Stage, N Stage, differentiation, grade, and overall status (p < 0.05).T Stage, N Stage, and other variables: T Stage and N Stage are strongly associated with each other (Chi-square = 323.41, p < 0.001) and show significant associations with differentiation, grade, 6th Stage, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).6th Stage and other variables: 6th Stage is strongly associated with T Stage, N Stage, differentiation, grade, A Stage, estrogen status, progesterone status, and overall status (p < 0.001).Differentiation and other variables: Differentiation is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, grade, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).Grade and other variables: Grade is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, A Stage, estrogen status, progesterone status, and overall status (p < 0.05).A Stage and other variables: A Stage shows a significant association with differentiation, estrogen status, and overall status (p < 0.05).Estrogen Status and other variables: Estrogen status is significantly associated with race, T Stage, N Stage, 6th Stage, differentiation, grade, A Stage, progesterone status, and overall status (p < 0.05).Progesterone Status and other variables: Progesterone status is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, grade, estrogen status, and overall status (p < 
0.05).Overall Status and other variables: Overall status is significantly associated with race, Marital Status, T Stage, N Stage, 6th Stage, differentiation, grade, A Stage, estrogen status, and progesterone status (p < 0.05).In addition, though, we'll perform a multivariate outlier detection using Mahalanobis Distance o the numerical variables of the dataset. In order to do so the calculation of the covariance matrix and its inverse is necessary in order to calculate the distances between the observations of the dataset.
# Build the numerical-variable matrix for Mahalanobis-distance outlier detection.
df_mahalanobis = breast_cancer[num_cols]
df_mahalanobis;
df_mahalanobis = df_mahalanobis.to_numpy()
# Covariance matrix and its inverse.
# NOTE(review): matrix_power(cov, -1) inverts the matrix; np.linalg.inv is
# the more idiomatic equivalent.
cov_matrix = np.cov(df_mahalanobis, rowvar=False)
cov_matrix_pm1 = np.linalg.matrix_power(cov_matrix, -1)
# Mean vector of the numerical variables (the distribution's center point).
centerpoint = np.mean(df_mahalanobis, axis=0)
The cutoff value for detecting outliers is based on the Chi-square distribution at the 1% significance level; thus, 0.99 is the corresponding quantile (confidence level) used below.
# Squared Mahalanobis distance of each observation from the center point.
# Vectorised with einsum instead of a per-row Python loop:
#   d_i = (x_i - mu)^T  S^-1  (x_i - mu)
diff = df_mahalanobis - centerpoint
distances = np.einsum('ij,jk,ik->i', diff, cov_matrix_pm1, diff)
# Cutoff (threshold) value from Chi-Square distribution for detecting outliers
# (0.99 quantile; degrees of freedom = number of numerical variables)
cutoff = chi2.ppf(0.99, df_mahalanobis.shape[1])
# Index of outliers
outlierIndexes = np.where(distances > cutoff)
# print('--- Index of Outliers ----')
# print(outlierIndexes)
print('--- Number of Outliers ----')
print(len(outlierIndexes[0]))
# Keep the outlying rows for closer inspection
df_multiv_outliers = df_mahalanobis[distances > cutoff, :]
--- Number of Outliers ---- 177
# Summarise the detected multivariate outliers across all columns
# (numerical stats and categorical counts via include='all')
breast_cancer.iloc[outlierIndexes[0]].describe(include='all')
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | Estrogen Status | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 177.000000 | 177 | 177 | 177 | 177 | 177 | 177 | 177 | 177 | 177.000000 | 177 | 177 | 177.000000 | 177.000000 | 177.000000 | 177 |
| unique | NaN | 3 | 5 | 4 | 3 | 5 | 4 | 4 | 2 | NaN | 2 | 2 | NaN | NaN | NaN | 2 |
| top | NaN | White | Married | T3 | N3 | IIIC | Moderately differentiated | 2 | Regional | NaN | Positive | Positive | NaN | NaN | NaN | Alive |
| freq | NaN | 155 | 101 | 73 | 106 | 106 | 89 | 89 | 159 | NaN | 152 | 129 | NaN | NaN | NaN | 106 |
| mean | 54.254237 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 60.474576 | NaN | NaN | 26.960452 | 15.920904 | 59.214689 | NaN |
| std | 9.893053 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.255303 | NaN | NaN | 12.472862 | 11.010556 | 30.326803 | NaN |
| min | 30.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.000000 | NaN | NaN | 3.000000 | 1.000000 | 2.000000 | NaN |
| 25% | 47.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 23.000000 | NaN | NaN | 19.000000 | 5.000000 | 37.000000 | NaN |
| 50% | 55.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 50.000000 | NaN | NaN | 26.000000 | 17.000000 | 60.000000 | NaN |
| 75% | 62.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 100.000000 | NaN | NaN | 35.000000 | 25.000000 | 83.000000 | NaN |
| max | 69.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 133.000000 | NaN | NaN | 61.000000 | 46.000000 | 107.000000 | NaN |
## Finding ellipse dimensions
# NOTE(review): pearson, ell_radius_x and ell_radius_y were computed here but
# never used by the plot below, so those dead computations are removed.
lambda_, v = np.linalg.eig(cov_matrix)
lambda_ = np.sqrt(lambda_)  # semi-axis lengths scale with sqrt of eigenvalues
# Ellipse patch sized by the chi-square cutoff, centred on the data mean.
# Only the first two numerical variables are drawn, so this is a 2-D
# projection of the full cutoff region — indicative only.
ellipse = patches.Ellipse(xy=(centerpoint[0], centerpoint[1]),
                          width=lambda_[0]*np.sqrt(cutoff)*2, height=lambda_[1]*np.sqrt(cutoff)*2,
                          angle=np.rad2deg(np.arccos(v[0, 0])), edgecolor='#9C648E')
ellipse.set_facecolor('#648E9C')  # fill color (the old comment named the wrong hex)
ellipse.set_alpha(0.5)
fig = plt.figure()
ax = plt.subplot()
ax.add_artist(ellipse)
# Scatter plot of the first two numerical variables
plt.scatter(df_mahalanobis[:, 0], df_mahalanobis[:, 1], color='#9C648E')
# Set plot title and labels
plt.title('Outlier Detection')
plt.xlabel('X')
plt.ylabel('Y')
plt.show();
As the above figure presents, based on the Mahalanobis distance and the Chi-square distribution at the 0.99 confidence level, there are 177 multivariate outliers. After careful consideration, having taken a look at those observations, we decided to retain them in the dataset, since their maximum values seem plausible. However, we may return to this point in case of failures during the modelling phase.
With the following command it is clear that all categorical variables are set to be of type object.
# Inspect column dtypes: the categorical variables are still plain 'object' here
breast_cancer.dtypes
Age int64 Race object Marital Status object T Stage object N Stage object 6th Stage object differentiate object Grade object A Stage object Tumor Size int64 Estrogen Status object Progesterone Status object Regional Node Examined int64 Regional Node Positive int64 Survival Months int64 Status object dtype: object
# List each categorical variable together with its distinct values
print_categorical_variables(breast_cancer)
Categorical Variables: Race: White, Black, Other Marital Status: Married, Divorced, Single , Widowed, Separated T Stage : T1, T2, T3, T4 N Stage: N1, N2, N3 6th Stage: IIA, IIIA, IIIC, IIB, IIIB differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated Grade: 3, 2, 1, anaplastic; Grade IV A Stage: Regional, Distant Estrogen Status: Positive, Negative Progesterone Status: Positive, Negative Status: Alive, Dead
For presentation reasons, we change the value anaplastic; Grade IV of the variable Grade to 4, and then convert all the values to Roman numerals so that they are consistent.
# Normalise the Grade labels: first rename ' anaplastic; Grade IV' (note the
# leading space, matching the raw data) to '4', then map all grades to Roman
# numerals. Chained `.replace(..., inplace=True)` on a column is deprecated in
# recent pandas (chained-assignment), so we assign the result back instead.
breast_cancer['Grade'] = breast_cancer['Grade'].replace(' anaplastic; Grade IV', '4')
breast_cancer['Grade'] = breast_cancer['Grade'].replace({'1': 'I', '2': 'II', '3': 'III', '4': 'IV'})
print_categorical_variables(breast_cancer)
Categorical Variables: Race: White, Black, Other Marital Status: Married, Divorced, Single , Widowed, Separated T Stage : T1, T2, T3, T4 N Stage: N1, N2, N3 6th Stage: IIA, IIIA, IIIC, IIB, IIIB differentiate: Poorly differentiated, Moderately differentiated, Well differentiated, Undifferentiated Grade: III, II, I, IV A Stage: Regional, Distant Estrogen Status: Positive, Negative Progesterone Status: Positive, Negative Status: Alive, Dead
Now it is necessary to create an order for the variables T Stage, N Stage, 6th Stage, differentiate, and Grade.
# Define the custom order for each categorical variable
t_stage_order = ['T1', 'T2', 'T3', 'T4']
n_stage_order = ['N1', 'N2', 'N3']
stage_6_order = ['IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC']
differentiate_order = ['Undifferentiated', 'Poorly differentiated', 'Moderately differentiated', 'Well differentiated']
grade_order = ['I', 'II','III', 'IV']
# Apply each custom order, converting the column to an ordered Categorical.
# (Note the trailing space in the 'T Stage ' column name — it matches the data.)
ordinal_columns = {
    'T Stage ': t_stage_order,
    'N Stage': n_stage_order,
    '6th Stage': stage_6_order,
    'differentiate': differentiate_order,
    'Grade': grade_order,
}
for column, order in ordinal_columns.items():
    breast_cancer[column] = pd.Categorical(breast_cancer[column], categories=order, ordered=True)
# Print the updated order of the categorical variables
print("Categorical Variables with Custom Order:")
print("T Stage:", breast_cancer['T Stage '].cat.categories)
print("N Stage:", breast_cancer['N Stage'].cat.categories)
print("6th Stage:", breast_cancer['6th Stage'].cat.categories)
print("differentiate:", breast_cancer['differentiate'].cat.categories)
print("Grade:", breast_cancer['Grade'].cat.categories)
Categorical Variables with Custom Order:
T Stage: Index(['T1', 'T2', 'T3', 'T4'], dtype='object')
N Stage: Index(['N1', 'N2', 'N3'], dtype='object')
6th Stage: Index(['IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC'], dtype='object')
differentiate: Index(['Undifferentiated', 'Poorly differentiated',
'Moderately differentiated', 'Well differentiated'],
dtype='object')
Grade: Index(['I', 'II', 'III', 'IV'], dtype='object')
import warnings
# Get the summary of categorical data.
# Build a list of per-variable frames and concatenate ONCE at the end:
# repeatedly concatenating onto an initially-empty DataFrame is quadratic
# and triggers FutureWarnings about concat with empty frames in pandas >= 2.
frames = []
# Iterate over each column
for column in breast_cancer.columns:
    # Check if the column is categorical
    if breast_cancer[column].dtype in ['object', 'category']:
        # Calculate the value counts
        value_counts = breast_cancer[column].value_counts().reset_index()
        value_counts.columns = ['Value', 'Count']
        # Add the variable name to the summary
        value_counts['Variable'] = column
        # Keep the same column order as the original summary frame
        frames.append(value_counts[['Variable', 'Value', 'Count']])
summary = (pd.concat(frames, ignore_index=True)
           if frames else pd.DataFrame(columns=['Variable', 'Value', 'Count']))
# Print the summary
print(summary)
Variable Value Count 0 Race White 3408 1 Race Other 320 2 Race Black 291 3 Marital Status Married 2639 4 Marital Status Single 615 5 Marital Status Divorced 485 6 Marital Status Widowed 235 7 Marital Status Separated 45 8 T Stage T2 1786 9 T Stage T1 1603 10 T Stage T3 529 11 T Stage T4 101 12 N Stage N1 2731 13 N Stage N2 817 14 N Stage N3 471 15 6th Stage IIA 1305 16 6th Stage IIB 1130 17 6th Stage IIIA 1047 18 6th Stage IIIC 471 19 6th Stage IIIB 66 20 differentiate Moderately differentiated 2348 21 differentiate Poorly differentiated 1110 22 differentiate Well differentiated 542 23 differentiate Undifferentiated 19 24 Grade II 2348 25 Grade III 1110 26 Grade I 542 27 Grade IV 19 28 A Stage Regional 3927 29 A Stage Distant 92 30 Estrogen Status Positive 3750 31 Estrogen Status Negative 269 32 Progesterone Status Positive 3321 33 Progesterone Status Negative 698 34 Status Alive 3404 35 Status Dead 615
From all the variables we have, we wanted to extract some new variables. Precisely, we will extract three categorical and two continuous (numerical):
Age Group: We will group the ages into 'Young', 'Middle-aged' and 'Elderly'. Race Group: As there is a big gap between the number of patients from different races, we will try to reduce it by indicating whether they are 'Caucasian' or 'Non-Caucasian'. Tumor Stage Group: We will group T1 into 'Early Stage' and T2, T3 and T4 into 'Advanced Stage'. The continuous ones will be the following:
Age at Diagnosis: the patient's age when diagnosed. Positive Lymph Ratio: ratio of positive to examined lymph nodes.# Create a new column for Age Group based on custom age ranges
# Create a new column for Age Group based on custom age ranges.
# BUG FIX: pd.cut builds right-closed, LEFT-OPEN intervals, so with bins
# starting at 30 the patients aged exactly 30 (the dataset minimum) fell
# outside every bin and were labelled NaN — only 4014 of the 4019 rows got
# an Age Group. Starting the first bin at 0 keeps every age in range while
# preserving the 50 / 60 boundaries.
breast_cancer['Age Group'] = pd.cut(breast_cancer['Age'], bins=[0, 50, 60, np.inf], labels=['Young', 'Middle-aged', 'Elderly'])
# Print the updated dataframe with the Age Group variable
print(breast_cancer[['Age', 'Age Group']])
print(breast_cancer['Age Group'].value_counts())
Age Age Group 0 68 Elderly 1 50 Young 2 58 Middle-aged 3 58 Middle-aged 4 47 Young ... ... ... 4014 62 Elderly 4015 56 Middle-aged 4016 68 Elderly 4017 58 Middle-aged 4018 46 Young [4019 rows x 2 columns] Age Group Young 1490 Middle-aged 1385 Elderly 1139 Name: count, dtype: int64
In order to have balanced data in this new variable, the boundaries set were the following:
# Create a new column for broader race groups, starting from an empty label
breast_cancer['Race Group'] = ''
# Label the two broader groups with boolean-mask assignments:
# 'White' becomes 'Caucasian'; 'Other' and 'Black' become 'Non-Caucasian'.
caucasian_mask = breast_cancer['Race'] == 'White'
breast_cancer.loc[caucasian_mask, 'Race Group'] = 'Caucasian'
non_caucasian_mask = breast_cancer['Race'].isin(['Other', 'Black'])
breast_cancer.loc[non_caucasian_mask, 'Race Group'] = 'Non-Caucasian'
# Print the updated dataframe with the Race and Race Group variables
print(breast_cancer[['Race', 'Race Group']])
print(breast_cancer['Race Group'].value_counts())
Race Race Group 0 White Caucasian 1 White Caucasian 2 White Caucasian 3 White Caucasian 4 White Caucasian ... ... ... 4014 Other Non-Caucasian 4015 White Caucasian 4016 White Caucasian 4017 Black Non-Caucasian 4018 White Caucasian [4019 rows x 2 columns] Race Group Caucasian 3408 Non-Caucasian 611 Name: count, dtype: int64
For this second variable we split between the 'Caucasian' and 'Non-Caucasian' races. Even though this reduces the gap between the race groups, the imbalance is still large. We keep the variable for now.
# Create a new column for Tumor Stage Group, starting from an empty label
breast_cancer['Tumor Stage Group'] = ''
# Label the two groups with boolean-mask assignments:
# T1 is 'Early Stage'; T2/T3/T4 are 'Advanced Stage'.
# (Note the trailing space in the 'T Stage ' column name.)
early_mask = breast_cancer['T Stage '] == 'T1'
breast_cancer.loc[early_mask, 'Tumor Stage Group'] = 'Early Stage'
advanced_mask = breast_cancer['T Stage '].isin(['T2', 'T3', 'T4'])
breast_cancer.loc[advanced_mask, 'Tumor Stage Group'] = 'Advanced Stage'
# Print the updated dataframe with the Tumor Stage Group variable
print(breast_cancer[['T Stage ', 'Tumor Stage Group']])
print(breast_cancer['Tumor Stage Group'].value_counts())
T Stage Tumor Stage Group 0 T1 Early Stage 1 T2 Advanced Stage 2 T3 Advanced Stage 3 T1 Early Stage 4 T2 Advanced Stage ... ... ... 4014 T1 Early Stage 4015 T2 Advanced Stage 4016 T2 Advanced Stage 4017 T2 Advanced Stage 4018 T2 Advanced Stage [4019 rows x 2 columns] Tumor Stage Group Advanced Stage 2416 Early Stage 1603 Name: count, dtype: int64
#Continuous
# Approximate age at diagnosis: current age minus whole years survived
# (integer division by 12 truncates partial years).
breast_cancer['Age at Diagnosis'] = breast_cancer['Age'] - (breast_cancer['Survival Months']//12)
# Fraction of examined regional lymph nodes that were positive.
# NOTE(review): assumes 'Regional Node Examined' is never 0 — would yield
# inf/NaN otherwise; TODO confirm against the full dataset.
breast_cancer['Positive Lymph Ratio'] = breast_cancer['Regional Node Positive'] / breast_cancer['Regional Node Examined']
breast_cancer.head(4)
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | ... | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | Age Group | Race Group | Tumor Stage Group | Age at Diagnosis | Positive Lymph Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68 | White | Married | T1 | N1 | IIA | Poorly differentiated | III | Regional | 4 | ... | Positive | 24 | 1 | 60 | Alive | Elderly | Caucasian | Early Stage | 63 | 0.041667 |
| 1 | 50 | White | Married | T2 | N2 | IIIA | Moderately differentiated | II | Regional | 35 | ... | Positive | 14 | 5 | 62 | Alive | Young | Caucasian | Advanced Stage | 45 | 0.357143 |
| 2 | 58 | White | Divorced | T3 | N3 | IIIC | Moderately differentiated | II | Regional | 63 | ... | Positive | 14 | 7 | 75 | Alive | Middle-aged | Caucasian | Advanced Stage | 52 | 0.500000 |
| 3 | 58 | White | Married | T1 | N1 | IIA | Poorly differentiated | III | Regional | 18 | ... | Positive | 2 | 1 | 84 | Alive | Middle-aged | Caucasian | Early Stage | 51 | 0.500000 |
4 rows × 21 columns
#generate_cross_tabulations(breast_cancer, 'Status')
# Plot every variable (including the new engineered ones) in a 3x7 grid
plot_dataframe(breast_cancer,3,7)
One comment we would like to point out is that we tried to normalize the variables Tumor Size and Regional Node Positive, but the Shapiro test still rejected normality. The code is excluded from this final deliverable.
# Bivariate analysis of the numerical variables against the target 'Status'
# (helper defined earlier in the notebook)
bivariate_numerical_exploratory_analysis(breast_cancer, 'Status')
# Categorical variables broken down by 'Status'
plot_categorical_variables(breast_cancer, "Status")
# Correlation heatmap over the numerical variables
correlation_heatmap(breast_cancer)
By following the same logic described during the Univariate and Bivariate analysis, conclusions can be derived for the new generated features.
To prevent our model from learning from an ordered dataset, we will shuffle the data. Also, as we do not have a huge dataset and compression is not necessary, we will save the new data into a CSV file.
It's important to note here that, the preprocessing (applying BoxCox transformation to Tumor Size and Positive Lymph Ratio, or One-hot encoding to categorical variables) as well as the normalization of the data with the Min-Max Scaler is taking place in the modelling notebooks, in order to be able to apply the transformations and the normalisation techniques on the training and test datasets separately.
# Shuffle the rows (seeded for reproducibility) and persist the result.
np.random.seed(666)
breast_cancer_new = breast_cancer.sample(frac=1)
breast_cancer_new = breast_cancer_new.reset_index(drop=True)
breast_cancer_new.to_csv('breast_cancer_new.csv', index=False)
# Read the file back to confirm the round trip worked
breast_cancer_read = pd.read_csv("breast_cancer_new.csv", header=0, delimiter=',')
breast_cancer_read.head(4)
| Age | Race | Marital Status | T Stage | N Stage | 6th Stage | differentiate | Grade | A Stage | Tumor Size | ... | Progesterone Status | Regional Node Examined | Regional Node Positive | Survival Months | Status | Age Group | Race Group | Tumor Stage Group | Age at Diagnosis | Positive Lymph Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 57 | White | Married | T4 | N3 | IIIC | Poorly differentiated | III | Distant | 85 | ... | Positive | 31 | 18 | 41 | Alive | Middle-aged | Caucasian | Advanced Stage | 54 | 0.580645 |
| 1 | 47 | White | Married | T2 | N1 | IIB | Moderately differentiated | II | Regional | 23 | ... | Positive | 6 | 5 | 50 | Alive | Young | Caucasian | Advanced Stage | 43 | 0.833333 |
| 2 | 37 | White | Single | T2 | N1 | IIB | Moderately differentiated | II | Regional | 23 | ... | Positive | 17 | 3 | 71 | Alive | Young | Caucasian | Advanced Stage | 32 | 0.176471 |
| 3 | 45 | White | Married | T1 | N1 | IIA | Poorly differentiated | III | Regional | 20 | ... | Positive | 15 | 1 | 97 | Alive | Young | Caucasian | Early Stage | 37 | 0.066667 |
4 rows × 21 columns